Data Wrangling

Data Set up

property <- st_read("/Users/jiajiabao/Documents/Upenn/Practicum/2025/Data/property/opa_properties_public2.geojson") %>%
  st_transform('EPSG:2272')

property2 <- st_read("/Users/jiajiabao/Documents/Upenn/Practicum/2025/Data/property/property2.geojson") %>%
  st_transform('EPSG:2272')

property_park <- st_read("/Users/jiajiabao/Documents/Upenn/Practicum/2025/Data/property/property_park.geojson")%>%
  st_transform('EPSG:2272')

property2$distance_to_nearest_park <- property_park$distance_to_nearest_park

studyarea1 <- st_read("/Users/jiajiabao/Documents/Upenn/Practicum/2025/Data/studyarea/StudyArea.shp")%>%
  st_transform('EPSG:2272')

I676 <- st_read("/Users/jiajiabao/Documents/Upenn/Practicum/2025/Data/studyarea/I_676.shp")

studyarea_tract <- st_read("/Users/jiajiabao/Documents/Upenn/Practicum/2025/Data/studyarea/StudyArea_blockgroup_tract.shp")
philly_blockgroup <- st_read("/Users/jiajiabao/Documents/Upenn/Practicum/2025/Data/Philly_blockgroup/Philly_blockgroup.shp")

nhood <- st_read("/Users/jiajiabao/Documents/Upenn/Practicum/2025/Data/philly_neighborhoods.geojson")

landuse <- st_read("/Users/jiajiabao/Documents/Upenn/Practicum/2025/Data/landuse_clip/Land_Use_ClipLayer.shp")

I676_2 <- st_read("/Users/jiajiabao/Documents/Upenn/Practicum/2025/Data/studyarea/discontinuity.shp")%>%
  st_transform('EPSG:2272')
cpi_data <- st_read("/Users/jiajiabao/Documents/Upenn/Practicum/2025/Data/property/cpi_data.csv")

commercial<- st_read("~/Documents/Upenn/Practicum/2025/Data/Commercial_Corridors.geojson")%>%
  st_transform('EPSG:2272')
studyarea_north <- st_read("~/Documents/Upenn/Practicum/2025/Data/studyarea/studyarea_north.shp") %>%
  st_transform('EPSG:2272')

studyarea_south <- st_read("~/Documents/Upenn/Practicum/2025/Data/studyarea/studyarea_south.shp") %>%
  st_transform('EPSG:2272')

property_final <- st_read("~/Documents/Upenn/Practicum/2025/results/property_final.geojson")%>%
  st_transform('EPSG:2272')

Data Processing

property data

#### clean data for city wise - sale price
colnames(cpi_data) <- c("half1", "half2", "cpi_date")
cpi_long <- cpi_data %>%
  mutate(
    Jan = `half1`, Feb = `half1`, Mar = `half1`, Apr = `half1`, May = `half1`, Jun = `half1`,
    Jul = `half2`, Aug = `half2`, Sep = `half2`, Oct = `half2`, Nov = `half2`, Dec = `half2`
  ) %>%
  select(-half1, -half2) %>%
  pivot_longer(cols = -cpi_date, names_to = "Month", values_to = "CPI") %>%
  mutate(
    Month = match(Month, month.abb),
    date = as.Date(paste(cpi_date, Month, "01", sep = "-"))  
  ) %>%
  select(date, CPI)

property_philly <-property%>%
  mutate(sale_date = as.Date(sale_date, format = "%Y-%m-%d")) %>%
  filter(sale_date >= as.Date("2020-01-01") & sale_date <= as.Date("2024-12-31"))%>%
  filter(!is.na(sale_price), !is.na(total_livable_area))%>%
  mutate(
    sale_month = floor_date(sale_date, "month"))%>%
  left_join(cpi_long, by = c("sale_month" = "date"))%>%
  mutate(CPI = as.numeric(CPI))%>%
  mutate(adj_sale_price = sale_price * (340.331/CPI))%>%
  select(-sale_month, -recording_date,  -other_building, 
         -assessment_date, -mailing_address_2, mailing_city_state, mailing_care_of)

mean_price <- mean(property_philly$adj_sale_price, na.rm = TRUE)
sd_price <- sd(property_philly$adj_sale_price, na.rm = TRUE)
extreme_outlier <- mean_price + 5 * sd_price

property_philly <- property_philly%>%
  filter(adj_sale_price > 30000, adj_sale_price < extreme_outlier) %>%
  filter(total_livable_area >= 100)

property_philly <- st_transform(property_philly, crs = 2272)

#st_write(property_philly, "property_philly.geojson", driver = "GeoJSON", delete_dsn = TRUE)

#### clean data for study area - sale price 
#study area intersection
property_studyarea <- st_intersection(property, studyarea1)

#sale price clean up and inflation adjusted
colnames(cpi_data) <- c("half1", "half2", "cpi_date")
cpi_long <- cpi_data %>%
  mutate(
    Jan = `half1`, Feb = `half1`, Mar = `half1`, Apr = `half1`, May = `half1`, Jun = `half1`,
    Jul = `half2`, Aug = `half2`, Sep = `half2`, Oct = `half2`, Nov = `half2`, Dec = `half2`
  ) %>%
  select(-half1, -half2) %>%
  pivot_longer(cols = -cpi_date, names_to = "Month", values_to = "CPI") %>%
  mutate(
    Month = match(Month, month.abb),
    date = as.Date(paste(cpi_date, Month, "01", sep = "-"))  
  ) %>%
  select(date, CPI)

property_data <-property_studyarea%>%
  mutate(sale_date = as.Date(sale_date, format = "%Y-%m-%d")) %>%
  filter(!is.na(sale_price))%>%
  filter(sale_date <= as.Date("2024-12-31"))%>%
  mutate(
    sale_date = as.Date(sale_date, format = "%Y-%m-%d"), 
    sale_month = floor_date(sale_date, "month"))%>%
  left_join(cpi_long, by = c("sale_month" = "date"))%>%
  mutate(CPI = as.numeric(CPI))%>%
  mutate(adj_sale_price = sale_price * (340.331/CPI)) %>%
  select(-sale_month,-recording_date,  -other_building, 
         -assessment_date, -mailing_address_2, mailing_city_state, mailing_care_of, -state_code)%>%
  relocate(adj_sale_price, sale_price, .after = 2)

mean_price <- mean(property_data$adj_sale_price, na.rm = TRUE)
sd_price <- sd(property_data$adj_sale_price, na.rm = TRUE)
extreme_outlier <- mean_price + 5 * sd_price

property_data <- property_data %>%
  filter(adj_sale_price > 30000, adj_sale_price < extreme_outlier)

property1 <- st_transform(property_data, crs = 2272)

#join attributes - I676(NS) - nhood1 (neighborhoods in study area)
property1 <- rbind(
    property1 %>% st_intersection(studyarea_north["geometry"]) %>%
      mutate(I676_NS = "north"),
    property1 %>% st_intersection(studyarea_south["geometry"]) %>%
      mutate(I676_NS = "south")
  )
nhood1 <- st_intersection(nhood, studyarea1)

#Features clean up for future FE and modeling
## Zoning
property1 <- property1%>%
  select(1:2, zoning, location, everything())

property1 <- property1 %>%
  mutate(zoning = case_when(
    location == "1310-16 VINE ST" ~ "CMX4",   
    location == "252-56 N 13TH ST" ~ "CMX4",
    location == "255-57 N BROAD ST" ~ "CMX4",
    location == "451 N 12TH ST" ~ "CMX3",
    location == "453 N 12TH ST" ~"CMX3",
    location == "447 N 12TH ST" ~ "CMX3",
    location == "1108 BUTTONWOOD ST" ~ "CMX3",
    location == "1104 BUTTONWOOD ST" ~ "CMX3",
    location == "256-60 N MARVINE ST" ~ "CMX4",
    location == "820 VINE ST" ~ "CMX4",
    TRUE ~ zoning                        
  ))

## central air
target_locations_N <- c(
                      "1232 SUMMER ST", 
                      "1030 BRANDYWINE ST",
                      "214 N 12TH ST",
                      "1021 SPRING ST")

property1 <- property1 %>%
  mutate(
    central_air = ifelse(location %in% target_locations_N & central_air == "N", "Y", central_air)
  )
## year built - clean up values of 1 and delete the other rows which are parking plot properties that have no year_built data
valid_locations <- c(
  "305-09 N 12TH ST",
  "1008-16 WOOD ST",
  "1004-06 WOOD ST",
  "211 N CAMAC ST",
  "231-33 N BROAD ST",
  "225-27 N 12TH ST"
)
property1 <- property1 %>%
  mutate(year_built = as.numeric(year_built)) %>%
  mutate(
    year_built = case_when(
      location == "305-09 N 12TH ST" & year_built < 1975 ~ 1950,
      location == "1008-16 WOOD ST" & year_built < 1975 ~ 1978,
      location == "1004-06 WOOD ST" & year_built < 1975 ~ 1920,
      location == "211 N CAMAC ST" & year_built < 1975 ~ 1900,
      location == "231-33 N BROAD ST" & year_built < 1975 ~ 2020,
      location == "225-27 N 12TH ST" & year_built < 1975 ~ 2024,
      TRUE ~ year_built
    )
  ) %>%
  filter(is.na(year_built) | year_built >= 1750 | location %in% c(
    "305-09 N 12TH ST",
    "1008-16 WOOD ST",
    "1004-06 WOOD ST",
    "211 N CAMAC ST",
    "231-33 N BROAD ST",
    "225-27 N 12TH ST"
  ))

Census Data

### The method here for pushing data is diffenrent from the one we usually use cause the there is some issues when downloading the geometry data

# census block group
census_api_key("92998588496b9701036218b765c78d2ffb7aedcd", install = TRUE, overwrite = TRUE)
acs_variable_list.2023 <- load_variables(2023, "acs5", cache = TRUE)

local_shapefile <- "~/Documents/Upenn/Practicum/2025/Data/census/tl_2023_42_bg"
bg_pa <- st_read(local_shapefile, quiet = TRUE)
bg_philly <- bg_pa %>%
  filter(COUNTYFP == "101") %>%
  st_transform(2272) %>%
  select(GEOID, geometry)

block2023_attr <- get_acs(
  geography = "block group",
  variables = c("B19013_001E", "B25058_001E"),
  year = 2023,
  state = 42,
  county = 101,
  geometry = FALSE
)

block2023_attr <- block2023_attr %>%
  select(GEOID, variable, estimate) %>%
  pivot_wider(names_from = variable, values_from = estimate) %>%
  rename(
    MedHHInc = B19013_001,
    MedRent = B25058_001
  )

block2023 <- left_join(bg_philly, block2023_attr, by = "GEOID")

studyarea_tract_df <- as.data.frame(studyarea_tract) %>% select(-geometry)
block2023_studyarea <- block2023 %>% inner_join(studyarea_tract_df, by = "GEOID")

#census tract
tract_pa <- st_read("~/Documents/Upenn/Practicum/2025/Data/census/tl_2023_42_tract", quiet = TRUE)

tract_philly <- tract_pa %>%
  filter(COUNTYFP == "101") %>%
  st_transform(2272) %>%
  select(GEOID, geometry)

tract_attr <- get_acs(
  geography = "tract",
  variables = c("B25026_001E", "B15001_050E", "B15001_009E","B06012_002E"),
  year = 2023, state = 42, county = 101, geometry = FALSE
)

tract_attr_wide <- tract_attr %>%
  select(GEOID, variable, estimate) %>%
  pivot_wider(names_from = variable, values_from = estimate) %>%
  rename(
    TotalPop = B25026_001,
    FemaleBachelors = B15001_050,
    MaleBachelors = B15001_009,
    TotalPoverty = B06012_002
  )

tract2023 <- left_join(tract_philly, tract_attr_wide, by = "GEOID")
studyarea_tract_df <- as.data.frame(studyarea_tract) %>% select(-geometry)

tract2023 <- tract2023 %>%
  mutate(GEOID_trimmed = as.numeric(str_sub(GEOID, 7, 9)))

studyarea_tract_df <- studyarea_tract_df %>%
  mutate(GEOID_trimmed = as.numeric(str_sub(TRACT, 2, 4)))

tract2023_studyarea <- tract2023 %>%
  inner_join(studyarea_tract_df, by = "GEOID_trimmed")
tract_studyarea <- st_intersection(studyarea1, tract2023_studyarea)

# Property1_property2_census_join
property1 <- property1 %>%
  mutate(census_tract = as.numeric(census_tract))
tract_studyarea_selected <- tract_studyarea %>%
  select(GEOID_trimmed, TotalPoverty, MaleBachelors, FemaleBachelors, TotalPop) %>%
  st_drop_geometry()  

property1_blockgroup <- st_join(property1, block2023_studyarea %>% select(GEOID, MedHHInc, MedRent), 
                            join = st_intersects)

property1_geometry <- st_geometry(property1)

tract_studyarea_selected <- tract_studyarea_selected %>%
  distinct(GEOID_trimmed, .keep_all = TRUE)
property1 <- property1_blockgroup %>%
  st_drop_geometry() %>% 
  left_join(tract_studyarea_selected, by = c("census_tract" = "GEOID_trimmed"))

property1 <- property1 %>%
  mutate(objectid = as.character(objectid))

property2 <- property2 %>%
  mutate(objectid = as.character(objectid))

property_combined <- property1 %>%
  left_join(property2, by = "objectid") %>%
  rename(sale_price = sale_price.y)%>%
  select(-sale_price.x)

Third Party Data

# POI-restaurant-drink-dessert
api_key <- "AIzaSyCa-yuy1Y3n07URlR2HHg6dp_cIEouv9hs"
keywords <- c("restaurant", "dim sum restaurant", "kitchen", "desert","tea", "bubble tea","matcha", "herbal tea")
place_types <- c("restaurant", "cafe", "bakery")

studyarea_proj <- st_transform(block2023_studyarea, 2272)

grid_pts <- st_make_grid(studyarea_proj, cellsize = 2000, what = "centers")
grid_pts <- st_sf(geometry = grid_pts) %>%
  st_filter(st_buffer(studyarea_proj, 1000)) 

grid_pts_wgs <- st_transform(grid_pts, 4326)
coords <- st_coordinates(grid_pts_wgs)

all_responses <- list()

for (pt_idx in seq_len(nrow(coords))) {
  location <- coords[pt_idx, c(2,1)]  # lat, lng
  cat(" Querying center point", pt_idx, "at", paste(location, collapse = ", "), "\n")
  
  for (ptype in place_types) {
    for (kw in keywords) {
      Sys.sleep(5)  
      cat("🔎", kw, "in", ptype, "\n")

      response <- google_places(
        keyword = kw,
        location = location,
        radius = 2000,  
        place_type = ptype,
        key = api_key
      )

      if (is.null(response[["results"]]) || length(response[["results"]]) == 0) {
        cat("No results for:", kw, "in", ptype, "\n")
        next
      }

      page_token <- response$next_page_token
      results_list <- list(response[["results"]])
      i <- 1

      while (!is.null(page_token)) {
        Sys.sleep(10)
        response_next <- google_places(
          keyword = kw,
          location = location,
          radius = 2000,
          place_type = ptype,
          key = api_key,
          page_token = page_token
        )

        if (is.null(response_next[["results"]]) || length(response_next[["results"]]) == 0) break

        results_list[[i + 1]] <- response_next[["results"]]
        page_token <- response_next$next_page_token
        i <- i + 1
      }

      all_responses[[paste(kw, ptype, pt_idx, sep = "_")]] <- bind_rows(results_list) %>%
        filter(!is.na(place_id)) %>%
        distinct(place_id, .keep_all = TRUE)
    }
  }
}

all_responses <- all_responses[lengths(all_responses) > 0]
restaurant_results <- bind_rows(all_responses) %>%
  mutate(lat = geometry$location$lat, lng = geometry$location$lng) %>%
  select(lat, lng, name, place_id, rating) %>%
  distinct(place_id, .keep_all = TRUE)

restaurant_sf <- st_as_sf(restaurant_results, coords = c("lng", "lat"), crs = 4326)
restaurant_proj <- st_transform(restaurant_sf, 2272)

studyarea_buffer <- st_buffer(studyarea_proj, 2000)
res_poi <- st_filter(restaurant_proj, studyarea_buffer)
radius_circles <- st_buffer(st_transform(grid_pts, 2272), dist = 2000)

# POI - grocery
keywords <- c("grocery store", "supermarket", "asian grocery", "chinese supermarket",
              "korean market", "japanese grocery", "international market", "food market")

place_types <- c("supermarket", "grocery_or_supermarket", "convenience_store", "store")

studyarea_proj <- st_transform(block2023_studyarea, 2272)

grid_pts <- st_make_grid(studyarea_proj, cellsize = 2000, what = "centers")
grid_pts <- st_sf(geometry = grid_pts) %>%
  st_filter(st_buffer(studyarea_proj, 1000))

grid_pts_wgs <- st_transform(grid_pts, 4326)
coords <- st_coordinates(grid_pts_wgs)

all_responses <- list()

for (pt_idx in seq_len(nrow(coords))) {
  location <- coords[pt_idx, c(2,1)]  # lat, lng
  cat(" Querying center", pt_idx, "at", paste(location, collapse = ", "), "\n")
  
  for (ptype in place_types) {
    for (kw in keywords) {
      Sys.sleep(5)  
      cat("🔎", kw, "in", ptype, "\n")

      response <- google_places(
        keyword = kw,
        location = location,
        radius = 2000,
        place_type = ptype,
        key = api_key
      )

      if (is.null(response[["results"]]) || length(response[["results"]]) == 0) next

      page_token <- response$next_page_token
      results_list <- list(response[["results"]])
      i <- 1

      while (!is.null(page_token)) {
        Sys.sleep(10)
        response_next <- google_places(
          keyword = kw,
          location = location,
          radius = 2000,
          place_type = ptype,
          key = api_key,
          page_token = page_token
        )
        if (is.null(response_next[["results"]]) || length(response_next[["results"]]) == 0) break

        results_list[[i + 1]] <- response_next[["results"]]
        page_token <- response_next$next_page_token
        i <- i + 1
      }

      all_responses[[paste(kw, ptype, pt_idx, sep = "_")]] <- bind_rows(results_list) %>%
        filter(!is.na(place_id)) %>%
        distinct(place_id, .keep_all = TRUE)
    }
  }
}

all_responses <- all_responses[lengths(all_responses) > 0]

grocery_results <- bind_rows(all_responses) %>%
  mutate(lat = geometry$location$lat,
         lng = geometry$location$lng) %>%
  select(lat, lng, name, place_id, rating) %>%
  distinct(place_id, .keep_all = TRUE)

grocery_sf <- st_as_sf(grocery_results, coords = c("lng", "lat"), crs = 4326)
grocery_proj <- st_transform(grocery_sf, 2272)

studyarea_buffer <- st_buffer(studyarea_proj, 2000)
grocery <- st_filter(grocery_proj, studyarea_buffer)

# property join poi
property_sf <- st_as_sf(property_combined, sf_column_name = "geometry")
property_proj <- st_transform(property_sf, 2272)
grocery_proj <- st_transform(grocery, 2272)
restaurant_proj <- st_transform(res_poi, 2272)

grocery_idx <- st_nearest_feature(property_proj, grocery_proj)

property_proj$nearest_grocery_name <- grocery_proj$name[grocery_idx]
property_proj$nearest_grocery_dist <- st_distance(
  property_proj,
  grocery_proj[grocery_idx, ],
  by_element = TRUE
)
restaurant_idx <- st_nearest_feature(property_proj, restaurant_proj)
property_proj$nearest_restaurant_name <- restaurant_proj$name[restaurant_idx]
property_proj$nearest_restaurant_dist <- st_distance(
  property_proj,
  restaurant_proj[restaurant_idx, ],
  by_element = TRUE
)
property_proj$nearest_grocery_dist_m <- as.numeric(property_proj$nearest_grocery_dist)
property_proj$nearest_restaurant_dist_m <- as.numeric(property_proj$nearest_restaurant_dist)
clean_property_poi_distances <- function(property_sf) {
  property_sf %>%
    mutate(
      nearest_grocery_dist_m = as.numeric(nearest_grocery_dist),
      nearest_restaurant_dist_m = as.numeric(nearest_restaurant_dist)) %>%
    select(
      -nearest_grocery_dist,
      -nearest_restaurant_dist
    )
}
property_final <- clean_property_poi_distances(property_proj)
#st_write(property_final, "property_final.geojson", driver = "GeoJSON", delete_dsn = TRUE)

Exploratory Data Analysis

Highway Impacts

1. Linear Relationship: Sale Price vs. Distance to I-676 by Zoning

property_filtered <- property_final
property_filtered <- st_as_sf(property_filtered, sf_column_name = "geometry")

avg_price_zoning <- property_filtered %>%
  st_drop_geometry() %>%  
  group_by(zoning) %>%
  summarise(avg_price_zoning = mean(adj_sale_price, na.rm = TRUE)) %>%
  ungroup()

property_filtered <- property_filtered %>%
  left_join(avg_price_zoning, by = "zoning")%>%
  filter(location != "711-39 SPRING GARDEN ST")

property_filtered <- property_filtered %>%
  mutate(zoning_group = case_when(
    zoning %in% c("CMX5", "ICMX") ~ "High-Density Commercial",
    zoning %in% c("IRMX") ~ "Residential Industrial",
    zoning %in% c("RM1", "RM4", "RMX3", "RSA5") ~ "Residential",
    zoning %in% c("CMX2.5", "CMX2", "CMX1") ~ "Neighborhood Commercial",
    zoning %in% c("CMX3", "CMX4") ~ "Community Commercial",
    TRUE ~ "Other"
  ))

avg_price_group <- property_filtered %>%
  st_drop_geometry() %>%
  group_by(zoning_group) %>%
  summarise(avg_adj_sale_price = mean(adj_sale_price, na.rm = TRUE)) %>%
  ungroup()

property_filtered <- property_filtered %>%
  left_join(avg_price_group, by = "zoning_group")


ggplot(property_filtered %>% filter(!is.na(zoning_group) & zoning_group != "Other"), 
       aes(x = log1p(distance_to_I676), y = log1p(adj_sale_price))) +
  geom_point(alpha = 0.5, size = 0.5, color = "#1B8370") + 
  geom_smooth(method = "lm", se = FALSE, color = "#E4572E") +
  facet_wrap(~ zoning_group, scales = "free") + 
  theme_minimal() +
  labs(title = "                      Zoning wise Relationship Between Distance to I-676 and Sales Price", 
       x = "Distance to I-676", 
       y = "Sale Price")

2. Linear Relationship: Sale Price vs. Distance to I-676 by Neighborhood

property_with_nhood <- st_join(property_filtered, nhood1, join = st_intersects)

nhood_summary <- property_with_nhood %>%
  group_by(NAME) %>%
  summarise(
    avg_sale_price = mean(adj_sale_price, na.rm = TRUE),
    avg_distance = mean(distance_to_I676, na.rm = TRUE)
  ) %>%
  ungroup()
nhood_map <- st_join(nhood1, nhood_summary, by = "NAME")

property_logansquare_split <- property_filtered %>%
  mutate(
    nhoods_name = case_when(
      GEOID %in% c("421010003003") ~ "Logan Square", 
      GEOID %in% c("421010125013", "421010125014") ~ NA_character_, 
      str_to_title(str_replace_all(nhoods_name, "_", " ")) == "Center City" ~ "Center City East (Near City Hall)", 
      TRUE ~ str_to_title(str_replace_all(nhoods_name, "_", " "))
    )
  ) %>%
  filter(!is.na(nhoods_name))

avg_price_nhoods <- property_logansquare_split %>%
  st_drop_geometry() %>%
  group_by(nhoods_name) %>%
  summarise(avg_sale_price = mean(adj_sale_price, na.rm = TRUE)) %>%
  ungroup()

highlight_nhoods <- c( "Callowhill", "West Poplar")
green_nhoods <- c("Center City East (Near City Hall)", "Chinatown", "Old City", "Logan Square")

property_logansquare_split$nhoods_name <- factor(
  property_logansquare_split$nhoods_name,
  levels = c(highlight_nhoods, green_nhoods)
)
ggplot(property_logansquare_split %>% filter(!is.na(nhoods_name)), 
       aes(x = log1p(distance_to_I676), y = log1p(adj_sale_price), 
           color = ifelse(nhoods_name %in% highlight_nhoods, "North Area", "South Area"))) +
  geom_point(alpha = 0.5, size = 0.5) +  
  geom_smooth(method = "lm", se = FALSE, color = "black") +  
  facet_wrap(~ nhoods_name, scales = "free", nrow = 2) +  
  scale_color_manual(values = c("North Area" = "#E4572E", "South Area" = "#1B8370")) +  
  theme_minimal() +
  theme(
    legend.position = "bottom",
    legend.justification = c(0.5, -0.2)
  ) +
  labs(
        title = "                   Neighborhood wise Relationship Between Distance to I-676 and Sales Price", 
       x = "Distance to I-676", 
       y = "Sale Price",
    color = "Neighborhood"
  )

4. Correlation Matrix

lm1_property <- property_final %>%
  st_drop_geometry() %>%  
  select(
    adj_sale_price,
    distance_to_I676,
    distance_to_city_hall,
    distance_to_nearest_transit,
    nearest_restaurant_dist_m,
    nearest_grocery_dist_m,
    distance_to_nearest_park,
    distance_to_nearest_water,
    total_livable_area,
    interior_condition,
    exterior_condition,
    number_of_bathrooms,
    number_of_bedrooms,
    garage_spaces,
    general_construction,
    crime_nn5
  ) %>%
  mutate(across(everything(), as.numeric)) %>%
  na.omit()

cor_matrix <- cor(lm1_property, use = "complete.obs")

var_labels <- c(
  "adj_sale_price" = "Sale Price",
  "distance_to_I676" = "Distance to Highway I-676",
  "distance_to_city_hall" = "Distance to City Hall",
  "distance_to_nearest_transit" = "Distance to Transit",
  "nearest_restaurant_dist_m" = "Distance to Restaurant",
  "nearest_grocery_dist_m" = "Distance to Grocery",
  "distance_to_nearest_park" = "Distance to Park",
  "distance_to_nearest_water" = "Distance to Water",
  "total_livable_area" = "Total Livable Area",
  "interior_condition" = "Interior Condition",
  "exterior_condition" = "Exterior Condition",
  "number_of_bathrooms" = "# of Bathrooms",
  "number_of_bedrooms" = "# of Bedrooms",
  "garage_spaces" = "# of Garage Spaces",
  "crime_nn5" = "Crime Rate",
  "general_construction" = "General Construction"
)

colnames(cor_matrix) <- var_labels[colnames(cor_matrix)]
rownames(cor_matrix) <- var_labels[rownames(cor_matrix)]

corrplot(
  cor_matrix,
  method = "color",
  col = colorRampPalette(c("#1B8370", "white", "#E46726"))(200),
  type = "lower",
  tl.pos = "lt",
  tl.col = "black",
  tl.cex = 0.6,
  mar = c(0, 0, 5, 0)
)

title(
  main = "Correlation Across Key Numeric Features",
  cex.main = 1.5,
  font.main = 2,
  line = 0.3,   
  adj = 0.6  
)

Feature Engineering

median_year <- median(property_final$year_built, na.rm = TRUE)
median_bedroom <- median(as.numeric(as.character(property_final$number_of_bedrooms)), na.rm = TRUE)
median_bathroom <- median(as.numeric(as.character(property_final$number_of_bathrooms)), na.rm = TRUE)

property_CT <- property_final %>%
  filter(zoning != "I2") %>%  
  mutate(
#year built
    year_built = as.numeric(year_built),
    year_built_missing = ifelse(is.na(year_built), 1, 0),
    year_built_filled = ifelse(is.na(year_built), median_year, year_built),
    year_built = case_when(
      is.na(year_built) ~ "Unknown",
      TRUE ~ as.character(year_built)
    ),
    year_built = factor(year_built),

#log 
    log_price = log(adj_sale_price),
    log_dist_highway = log(distance_to_I676 + 1),

    
# Central Air
    central_air = case_when(
      central_air %in% c("Y", "1") | is.na(central_air) ~ "Y",
      central_air %in% c("N", "0") ~ "N"
    ),
    central_air = factor(central_air, levels = c("Y", "N")),
    central_air_missing = ifelse(central_air == "Unknown", 1, 0),

# Garage Spaces
    garage_spaces = ifelse(garage_spaces %in% c(8, 10), NA, garage_spaces),
    garage_spaces_missing = ifelse(is.na(garage_spaces), 1, 0),
    garage_spaces = case_when(
      is.na(garage_spaces) ~ "Unknown",
      garage_spaces == 0 | garage_spaces == 1 ~ "less than 2",
      TRUE ~ "2 or more"
    ),
    garage_spaces = factor(garage_spaces, levels = c("less than 2", "2 or more", "Unknown")),

    # Bedrooms
    number_of_bedrooms = as.numeric(as.character(number_of_bedrooms)),
    number_of_bedrooms_missing = ifelse(is.na(number_of_bedrooms), 1, 0),
    number_of_bedrooms_filled = ifelse(is.na(number_of_bedrooms), median_bedroom, number_of_bedrooms),
    number_of_bedrooms = case_when(
      is.na(number_of_bedrooms) ~ "NA",
      number_of_bedrooms == 0 ~ "0",
      number_of_bedrooms == 1 ~ "1",
      number_of_bedrooms %in% c(2, 3, 4) ~ "2",
      number_of_bedrooms %in% c(5, 6) ~ "3",
      number_of_bedrooms %in% c(7, 9, 12) ~ "4",
      TRUE ~ "Other"
    ),
    number_of_bedrooms = factor(number_of_bedrooms),

    # Bathrooms
    number_of_bathrooms = as.numeric(as.character(number_of_bathrooms)),
    number_of_bathrooms_missing = ifelse(is.na(number_of_bathrooms), 1, 0),
    number_of_bathrooms_filled = ifelse(is.na(number_of_bathrooms), median_bathroom, number_of_bathrooms),
    number_of_bathrooms = case_when(
      number_of_bathrooms == 0 ~ "0",
      number_of_bathrooms == 1 ~ "1",
      number_of_bathrooms == 2 ~ "2",
      number_of_bathrooms %in% c(3, 4, 5) ~ "3-5",
      number_of_bathrooms == 6 ~ "6",
      number_of_bathrooms == 7 ~ "7",
      TRUE ~ "Unknown"
    ),
    number_of_bathrooms = factor(number_of_bathrooms),

    # total livable area
    total_livable_area = ifelse(is.na(total_livable_area) | total_livable_area == 1, 0, total_livable_area),

    # Quality Grade
    quality_grade = case_when(
      quality_grade %in% c("3 ", "7 ") ~ "Good",
      quality_grade %in% c("A", "A+", "A-", "C+", "C") | is.na(quality_grade) ~ "Average",
      TRUE ~ "Below Average"
    ),
    quality_grade = factor(quality_grade, levels = c("Good", "Average", "Below Average")),
    quality_grade_missing = ifelse(is.na(quality_grade), 1, 0),

    # Construction
    general_construction = as.character(general_construction),
    general_construction = case_when(
      is.na(general_construction) ~ "Unknown",
      TRUE ~ general_construction
    ),
    general_construction = factor(general_construction),
    general_construction_missing = ifelse(general_construction == "Unknown", 1, 0),

    # Separate Utilities
    separate_utilities = case_when(
      is.na(separate_utilities) ~ "Unknown",
      TRUE ~ separate_utilities
    ),
    separate_utilities = factor(separate_utilities, levels = c("A", "B", "C", "Unknown")),
    separate_utilities_missing = ifelse(separate_utilities == "Unknown", 1, 0),

    # Exterior Condition
    exterior_condition = case_when(
      exterior_condition == 2 ~ "Good",
      exterior_condition %in% c(1, 6, 7) ~ "Average",
      exterior_condition %in% c(3, 4, 5) ~ "Below Average",
      TRUE ~ "Unknown"
    ),
    exterior_condition = factor(exterior_condition),

    # Interior Condition
    interior_condition = case_when(
      interior_condition %in% c(2, NA) ~ "Good",
      interior_condition %in% c(1, 4, 7) ~ "Average",
      interior_condition %in% c(3, 5, 6) ~ "Below Average",
      TRUE ~ "Unknown"
    ),
    interior_condition = factor(interior_condition),

    # Category Code
    category_code_original = as.character(category_code),
    category_code_group = case_when(
      category_code == "12" ~ "6 ",
      category_code == "14" ~ "2 ",
      category_code == "10" ~ "4 ",
      category_code == "15" ~ "5 ",
      category_code == "16" ~ "4 ",
      category_code == "7 "  ~ "4 ",
      category_code == "8 "  ~ "6 ",
      category_code == "9 "  ~ "2 ",
      category_code %in% c("1 ", "2 ", "3 ", "4 ", "5 ", "6 ") ~ category_code,
      TRUE ~ "Other"
    ),
    category_code = factor(category_code_group),

    # Zoning
    zoning_group = case_when(
      zoning %in% c("CMX5") ~ "CMX5",
      zoning %in% c("IRMX") ~ "IRMX",
      zoning %in% c("RM1", "RM2", "RM4") ~ "RM",
      zoning %in% c("RMX3") ~ "RMX3",
      zoning %in% c("RSA5") ~ "RSA5",
      zoning %in% c("ICMX") ~ "ICMX",
      zoning %in% c("SPPOA") ~ "SPPOA",
      zoning %in% c("I2") ~ "I2",
      zoning %in% c("CMX2.5", "CMX2", "CMX1") ~ "CMX(1,2,2.5)",
      zoning %in% c("CMX3", "CMX4") ~ "CMX(3,4)",
      TRUE ~ "Other"
    ),
    zoning_group = factor(zoning_group),

    zoning = case_when(
      is.na(zoning) ~ "Unknown",
      TRUE ~ zoning
    ),
    zoning = factor(zoning),
    zip_code = factor(zip_code)
  )

mean_depth <- mean(property_CT$depth, na.rm = TRUE)
property_CT$depth[is.na(property_CT$depth)] <- mean_depth

st_write(property_CT, "property_CT_FE.geojson", driver = "GeoJSON", delete_dsn = TRUE)
## Deleting source `property_CT_FE.geojson' using driver `GeoJSON'
## Writing layer `property_CT_FE' to data source 
##   `property_CT_FE.geojson' using driver `GeoJSON'
## Writing 1594 features with 121 fields and geometry type Point.

Log Transformation

plot_df <- property_final %>%
  mutate(
    total_livable_area_log = log1p(total_livable_area) 
  )

plot1 <- ggplot(plot_df, aes(x = total_livable_area)) +
  geom_histogram(bins = 30, fill = "#1B8370", alpha = 0.8) +
  labs(title = "Original Total Livable Area",
        x = "Total Livable Area", 
        y = "Count") +
  xlim(0, quantile(plot_df$total_livable_area, 0.99, na.rm = TRUE)) + 
  theme_minimal() +
  theme(
    plot.title = element_text(size = 12, hjust = 0.5)
  )

plot2 <- ggplot(plot_df, aes(x = total_livable_area_log)) +
  geom_histogram(bins = 30, fill = "#1B8370", alpha = 0.8) +
  labs(title = "Log-transformed Total Livable Area",
        x = "(Log) Total Livable Area", 
        y = "Count")  +
  xlim(5, 9) +  
  theme_minimal() +
  theme(
    plot.title = element_text(size = 12, hjust = 0.5),
    axis.title.y = element_blank(),
    axis.text.y = element_blank(),
    axis.ticks.y = element_blank()
  )
combined_plot <- plot1 + plot2 +
  plot_layout(widths = c(1, 1)) +
  plot_annotation(
    title = "Distribution of Total Livable Area",
    caption = "Figure X.X",
    theme = theme(
      plot.title = element_text(size = 16,face = "bold", hjust = 0.5),
      plot.caption = element_text(hjust = 0.5)
    )
  )
combined_plot

Reclassification

lm1_property <- property_final %>%
  mutate(
    interior_condition_recode = case_when(
      is.na(interior_condition) | interior_condition %in% c(2) ~ "Good",
      interior_condition %in% c(1,4,7)~ "Average",
      interior_condition %in% c(3, 5,6) ~ "Poor",
      TRUE ~ "Unknown"
    ),
    interior_condition_recode = factor(interior_condition_recode, levels = c("Good", "Average", "Poor", "Unknown"))
  )
plot1 <- ggplot(lm1_property, aes(x = factor(interior_condition))) +
  geom_bar(fill = "#1B8370", alpha = 0.8, width = 0.6) +
  ggtitle("Original Interior Condition") +
  labs(x = "Interior Condition Code", y = "Count") +
  theme_minimal(base_size = 10) +
  theme(plot.title = element_text(size = 10, face = "bold", hjust = 0.5))

plot2 <- lm1_property %>%
  group_by(interior_condition) %>%
  summarise(mean_log_price = mean(market_value, na.rm = TRUE)) %>%
  ggplot(aes(x = factor(interior_condition), y = mean_log_price)) +
  geom_col(fill = "#E46726", alpha = 0.8, width = 0.6) +
  ggtitle("Mean Sale Price by Original Condition") +
  labs(x = "Interior Condition Code", y = "Avg Sale Price ($)") +
  theme_minimal(base_size = 10) +
  theme(plot.title = element_text(size = 10, face = "bold", hjust = 0.5))

plot3 <- ggplot(lm1_property, aes(x = interior_condition_recode)) +
  geom_bar(fill = "#1B8370", alpha = 0.8, width = 0.6) +
  ggtitle("Reclassified Interior Condition") +
  labs(x = "Interior Condition Recode", y = "Count") +
  theme_minimal(base_size = 10) +
  theme(plot.title = element_text(size = 10, face = "bold", hjust = 0.5))

plot4 <- lm1_property %>%
  group_by(interior_condition_recode) %>%
  summarise(mean_log_price = mean(market_value, na.rm = TRUE)) %>%
  ggplot(aes(x = interior_condition_recode, y = mean_log_price)) +
  geom_col(fill = "#E46726", alpha = 0.8, width = 0.6) +
  ggtitle("Mean Sale Price by Reclassified Condition") +
  labs(x = "Interior Condition Recode", y = "Avg Sale Price ($)") +
  theme_minimal(base_size = 10) +
  theme(plot.title = element_text(size = 10, face = "bold", hjust = 0.5))

# Combine with patchwork + large centered title
combined_plot <- (plot1 + plot2 + plot_layout(tag_level = "new")) /
                 (plot3 + plot4 + plot_layout(tag_level = "new")) +
  plot_annotation(
    title = "Interior Condition and Sale Price",
    subtitle = "Before (Top) and After (Bottom) Reclassification",
    caption = "Figure X.X",
    theme = theme(
      plot.title = element_text(size = 16, face = "bold", hjust = 0.5),
      plot.subtitle = element_text(size = 10, hjust = 0.5),
      plot.caption = element_text(hjust = 0.5),
      plot.title.position = "plot"
    )
  )
combined_plot

Comparing north and south - Continuous Variables

property_fe <- property_CT %>%
  st_drop_geometry() %>%
  mutate(
    log_sale_price = log1p(adj_sale_price),
    log_distance_to_I676 = log1p(distance_to_I676),
    log_distance_to_city_hall = log1p(distance_to_city_hall),
    log_total_livable_area = log1p(total_livable_area),
    I676_NS = as.character(I676_NS) 
  )

property_fe_selected <- property_fe %>%
  select(
    I676_NS,
    log_sale_price,
    log_distance_to_I676,
    log_distance_to_city_hall,
    log_total_livable_area
  ) %>%
  drop_na()

correlation_long <- property_fe_selected %>%
  pivot_longer(-c(log_sale_price, I676_NS), names_to = "Variable", values_to = "Value") %>%
  mutate(
    use_in_plot = case_when(
      Variable == "log_total_area" & Value <= log1p(10) ~ FALSE,
      Variable == "log_total_livable_area" & Value <= log1p(10) ~ FALSE,
      TRUE ~ TRUE
    )
  )

cor_selected <- correlation_long %>%
  filter(use_in_plot) %>%
  group_by(Variable, I676_NS) %>%
  summarise(r = cor(Value, log_sale_price, use = "complete.obs"), .groups = "drop") %>%
  mutate(
    x_pos = -Inf,
    y_pos = ifelse(I676_NS == "north", Inf, -Inf),
    vjust_pos = ifelse(I676_NS == "north", 1.1, -1.1)
  )

ggplot(correlation_long %>% filter(use_in_plot),
       aes(x = Value, y = log_sale_price)) +
  geom_point(aes(color = I676_NS), alpha = 0.4, size = 0.3) +
  geom_smooth(aes(color = I676_NS), method = "lm", se = FALSE, linewidth = 1) +
  facet_wrap(~ Variable, scales = "free", ncol = 3) +
  scale_color_manual(values = c("north" = "#1a9988", "south" = "#eb5600")) +
  labs(
    title = "Log-transformed Sale Price vs Continuous Features by I-676 Side",
    caption = "Figure X.X",
    x = "Values of Continuous Features",
    y = "Log(Sale Price)",
    color = "I-676 Side"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(size = 15, face = "bold",hjust = 0.5),
    strip.text = element_text(size = 10, face = "bold"),
    plot.caption = element_text(hjust = 0.5),
    legend.position = "bottom"
  )

Comparing north and south - Categorical Variables

# Prepare aggregated data by zoning group and I676 side
zoning_bar_data <- property_CT %>%
  st_drop_geometry() %>%
  filter(!is.na(zoning_group), !is.na(adj_sale_price), !is.na(I676_NS)) %>%
  group_by(zoning_group, I676_NS) %>%
  summarise(mean_sale_price = mean(adj_sale_price, na.rm = TRUE), .groups = "drop")

# Prepare aggregated data by quality grade and I676 side
quality_bar_data <- property_CT %>%
  st_drop_geometry() %>%
  filter(!is.na(quality_grade), !is.na(adj_sale_price), !is.na(I676_NS)) %>%
  group_by(quality_grade, I676_NS) %>%
  summarise(mean_sale_price = mean(adj_sale_price, na.rm = TRUE), .groups = "drop")

# Plot for Zoning Group
plot_zoning <- ggplot(zoning_bar_data, aes(x = zoning_group, y = mean_sale_price, fill = I676_NS)) +
  geom_col(position = "dodge", alpha = 0.9) +
  scale_fill_manual(values = c("north" = "#1a9988", "south" = "#eb5600")) +
  labs(
    title = "Average Sale Price by Zoning Group and I-676 Side",
    x = "Zoning Group",
    y = "Avg Sale Price ($)",
    fill = "I-676 Side"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(face = "bold", size = 12, hjust = 0.5),
    axis.text.x = element_text(angle = 45, hjust = 1, size = 11)
  )

# Plot for Quality Grade
plot_quality <- ggplot(quality_bar_data, aes(x = factor(quality_grade), y = mean_sale_price, fill = I676_NS)) +
  geom_col(position = "dodge", alpha = 0.9) +
  scale_fill_manual(values = c("north" = "#1a9988", "south" = "#eb5600")) +
  labs(
    title = "Average Sale Price by Quality Grade and I-676 Side",
    x = "Quality Grade",
    y = "Avg Sale Price ($)",
    fill = "I-676 Side"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(face = "bold", size = 10, hjust = 0.5),
    axis.text.x = element_text(hjust = 1, size = 8)
  )

# Hide zoning legend, keep quality legend
plot_zoning <- plot_zoning + theme(legend.position = "none")

# Combine
library(patchwork)
final_plot <- plot_zoning + plot_quality + plot_layout(ncol = 2)
final_plot

zoning_group_colors <- c(
  "RM" = "#F9844A",                 
  "RSA5" = "#E89972",             
  "RMX3" = "#BFD3C1",             
  "CMX5" = "#3B7D64",             
  "ICMX" = "#004B40",              
  "IRMX" = "#FFDAB9",             
  "RMX1" = "#F9C74F",             
  "SPPOA" = "#F8961E", 
  "CMX(1,2,2.5)" = "#43AA8B",
  "CMX(3,4)" = "#90BE6D",
  "Other" = "grey80"
)

ggplot() +
  geom_sf(data = block2023_studyarea, fill = "grey95", color = "grey80", size = 0.3) +
  geom_sf(data = I676_2, color = "red", size = 1, alpha = 0.9) +
  geom_sf(data = property_CT %>% filter(!is.na(zoning_group)),
          aes(color = zoning_group), size = 1, alpha = 0.7) +
  scale_color_manual(values = zoning_group_colors, na.translate = FALSE) +
  labs(
    title = "Zoning Groups of Properties within Study Area",
    subtitle = "Study Area Around I676",
    caption = "Figure X.X",
    color = "Zoning Group"
  ) +
  theme_void() +
  theme(
    plot.title = element_text(size = 16, face = "bold"),
    plot.subtitle = element_text(size = 12),
    plot.caption = element_text(hjust = 0.5),
    legend.title = element_text(size = 9),
    legend.text = element_text(size = 8),
    panel.border = element_rect(color = "black", fill = NA, size = 1)
  )

ggplot() +
  geom_sf(data = block2023_studyarea, fill = "grey95", color = "grey80", size = 0.3) +
  geom_sf(data = I676_2, color = "red", size = 1, alpha = 0.9) +
  geom_sf(
    data = property_CT %>% filter(!is.na(LPSS_PER1000)),
    aes(color = LPSS_PER1000),
    size = 1,
    alpha = 0.8
  ) +
  scale_color_gradientn(
    colours = c("#FFDDAB", "#E89972", "#BFD3C1", "#3B7D64", "#004B40"),
    name = "LPSS Score\n(per 1000)",
    na.value = "grey70"
  ) +

  labs(
    title = "Low Produce Supply Score (LPSS) within Study Area",
    subtitle = "Darker = Higher LPSS, i.e., More Limited Access",
    caption = "Figure X.X"
  ) +
  theme_void() +
  theme(
    plot.title = element_text(size = 16, face = "bold"),
    plot.subtitle = element_text(size = 12),
    plot.caption = element_text(hjust = 0.5),
    legend.title = element_text(size = 9),
    legend.text = element_text(size = 8),
    panel.border = element_rect(color = "black", fill = NA, size = 1)
  )

MODEL

lm_CT <- lm(
  log(adj_sale_price) ~ log_dist_highway  + log(distance_to_city_hall +1) + log (distance_to_nearest_transit +1) + log (nearest_restaurant_dist_m +1)  + log(distance_to_nearest_park +1) + zoning + zip_code + exterior_condition + interior_condition + log1p(total_livable_area) + number_of_bathrooms  + number_of_bedrooms + quality_grade + log(distance_to_nearest_water+1) + central_air + year_built + year_built_missing  + general_construction + garage_spaces + separate_utilities  + category_code + crime_nn5  +LPSS_PER1000 +general_construction_missing+ number_of_bathrooms_missing + number_of_bedrooms_missing + separate_utilities_missing+ separate_utilities_missing,  
          data = property_CT)

model_summary <- tidy(lm_CT) %>%
  mutate(
    p.value = signif(p.value, 3),
    estimate = round(estimate, 3),
    std.error = round(std.error, 3),
    statistic = round(statistic, 2),
    sig = case_when(
      p.value < 0.001 ~ "***",
      p.value < 0.01  ~ "**",
      p.value < 0.05  ~ "*",
      p.value < 0.1   ~ ".",
      TRUE ~ ""
    )
  ) %>%
  select(term, estimate, std.error, statistic, p.value, sig)

glance_stats <- glance(lm_CT)

model_stats <- tibble(
  term = c("R-squared", "Adj. R-squared", "Num. Obs."),
  estimate = c(round(glance_stats$r.squared, 3),
               round(glance_stats$adj.r.squared, 3),
               glance_stats$nobs),
  std.error = NA,
  statistic = NA,
  p.value = NA,
  sig = ""
)

model_full <- bind_rows(model_summary, model_stats)

html_table <- model_full %>%
  kable(format = "html", caption = "OLS Regression Results", digits = 3) %>%
  kable_styling(full_width = FALSE, bootstrap_options = c("striped", "hover", "condensed")) %>%
  row_spec(0, bold = TRUE, background = "#1a9988", color = "white")

save_kable(html_table, file = "ols_table.html")
webshot2::webshot("ols_table.html", file = "ols_table.png", vwidth = 1200, vheight = 3000)

OLS predictive model

set.seed(123)
split <- initial_split(property_CT, prop = 0.8)  # 80% train, 20% test

train_data <- training(split)
test_data <- testing(split)
lm_train <- lm(
  log(adj_sale_price) ~ 
    log_dist_highway + 
    log(distance_to_city_hall + 1) + 
    log(distance_to_nearest_transit + 1) + 
    log(nearest_restaurant_dist_m + 1) + 
    log(distance_to_nearest_park + 1) + 
    zoning + 
    zip_code + 
    exterior_condition + 
    interior_condition + 
    log1p(total_livable_area) + 
    number_of_bathrooms + 
    number_of_bedrooms + 
    quality_grade + 
    log(distance_to_nearest_water + 1) + 
    central_air + 
#    year_built_group 
#    year_built_missing + 
    general_construction + 
    garage_spaces + 
    separate_utilities + 
    crime_nn5 + 
    LPSS_PER1000 + 
    general_construction_missing + 
    number_of_bathrooms_missing + 
    number_of_bedrooms_missing + 
    separate_utilities_missing,
  data = train_data
)
test_data$log_pred <- predict(lm_train, newdata = test_data)
test_data$price_pred <- exp(test_data$log_pred) - 1

test_data <- test_data %>%
  mutate(
    abs_error = abs(price_pred - adj_sale_price),
    pct_error = abs_error / adj_sale_price
  )
mean(test_data$pct_error, na.rm = TRUE) 
## [1] 0.5506515
median(test_data$pct_error, na.rm = TRUE) 
## [1] 0.3354613
ggplot(test_data, aes(x = price_pred, y = adj_sale_price)) +
  geom_point(alpha = 0.4) +
  geom_abline(slope = 1, intercept = 0, color = "red", linetype = "dashed") +
  labs(
    title = "Test Set: Predicted vs Actual Sale Price",
    x = "Predicted Price",
    y = "Actual Price"
  ) +
  theme_minimal()

rmse_value <- sqrt(mean((test_data$price_pred - test_data$adj_sale_price)^2, na.rm = TRUE))
mae_value <- mean(abs(test_data$price_pred - test_data$adj_sale_price), na.rm = TRUE)
property_predict <- property_CT %>%
  mutate(
    log_price = log(adj_sale_price),
    log_pred = predict(lm_train, newdata = .)
  )
comparison_df <- property_predict %>%
  select(location, log_price, log_pred)